package com.chance.cc.crawler.prod.command.job.domain.news.sina.module;

import com.alibaba.fastjson.JSON;
import com.chance.cc.crawler.core.CrawlerEnum;
import com.chance.cc.crawler.core.CrawlerJob;
import com.chance.cc.crawler.core.downloader.HttpConfig;
import com.chance.cc.crawler.core.downloader.HttpConstant;
import com.chance.cc.crawler.core.downloader.HttpPage;
import com.chance.cc.crawler.core.downloader.HttpRequestBody;
import com.chance.cc.crawler.core.filter.FilterUtils;
import com.chance.cc.crawler.core.record.CrawlerRecord;
import com.chance.cc.crawler.core.record.CrawlerRequestRecord;
import com.chance.cc.crawler.meta.core.bean.CrawlerMetaConstant;
import com.chance.cc.crawler.meta.core.bean.job.CrawlerScheduleJob;
import com.chance.cc.crawler.prod.command.job.domain.news.sina.SinaCommonCrawlerSchedulerJob;
import org.apache.commons.lang3.StringUtils;

import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRecordFilter.dateRange;
import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.*;
import static com.chance.cc.crawler.development.scripts.allfeild.AICCommonField.Tag_Site_Info;

/**
 * @Author Zhao.Hhuan
 * @Date Create in 2021/4/8 19:05
 * @Description
 *      新浪板块 新媒体 实时
 **/
public class SinaEstationTraceCrawlerSchedulerJob extends SinaCommonCrawlerSchedulerJob {
    private static final String crawler_level = "trace";

    public static void main(String[] args) {
        publishCrawlerScheduleJobInfo();
//        System.out.println(crawlerSchedulejob().generateCrawlerTriggerKey());
    }

    public static CrawlerJob publishCrawlerScheduleJobInfo(){
        CrawlerJob crawlerJob = crawlerSchedulejob();

        //发布定时采集作业
        CrawlerScheduleJob crawlerScheduleJob = new CrawlerScheduleJob();
        crawlerScheduleJob.setDomain(domain);
        crawlerScheduleJob.setCrawlerJob(JSON.toJSONString(crawlerJob));
        crawlerScheduleJob.setJobType(CrawlerMetaConstant.ScheduleCrawlerJobType.crawler.enumVal());
        crawlerScheduleJob.setNote("新浪体育新媒体凌晨1点回溯7天定时采集");
        crawlerScheduleJob.setCrawlerKey(crawlerJob.generateCrawlerKey());
        HttpPage page = metaServiceCommand.addOrUpdateCrawlerScheduleJob(crawlerScheduleJob);
        System.out.println("发布crawler作业：" + page.getRawText());
        return crawlerJob;

    }

    public static CrawlerJob crawlerSchedulejob(){
        String url = "https://auto.sina.com.cn/estation/";
        String site = "auto";
        String siteInfo = "estation";
        String siteBiz = "news-"+crawler_level;
        CrawlerRequestRecord crawlerRequestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, turnPage)
                .domain(domain)
                .httpUrl(url)
                .httpConfig(HttpConfig.me(domain))
                .filter(dateRange)
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 7 + 2 ,null))
                .releaseTime(System.currentTimeMillis())
                .resultLabelTag(CrawlerEnum.CrawlerDataType.article)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.interaction)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.comment)
                .needParsed(false)
                .proxy(proxy)
                .build();
        crawlerRequestRecord.setDownload(false);
        crawlerRequestRecord.setSkipPipeline(true);
        crawlerRequestRecord.tagsCreator().bizTags().addDomain(domain);
        crawlerRequestRecord.tagsCreator().bizTags().addSite(site);
        crawlerRequestRecord.tagsCreator().bizTags().addSiteBiz(siteBiz);
        crawlerRequestRecord.tagsCreator().bizTags().addCustomKV(Tag_Site_Info,siteInfo);

        CrawlerRecord commentFilter = new CrawlerRequestRecord();
        commentFilter.setFilter(dateRange);
        commentFilter.addFilterInfo(FilterUtils.dateRangeFilterInfo(24 + 2,null));
        crawlerRequestRecord.tagsCreator().resultTags().getCategoryTag().addKVTag("comment_filter_record", JSON.toJSONString(commentFilter));

        //获取地址的时候需要添加site确定拿取到的地址为本次需要的
        String requestQueueName = StringUtils.joinWith("-","crawler",domain,site,siteInfo,siteBiz,"queue");
        CrawlerRequestRecord initCrawlerRecord = CrawlerRequestRecord.builder()
                .startPageRequest("sina_estation_item_request",turnPageItem)
                .httpUrl("http://"+metaServiceIp+":"+metaServicePort+"/crawler/domain/common/api/v1/"+domain+"/results/queue/init?requestQueueName="+requestQueueName + "&site=sina_estation_item_url&hourFromNow=170")
                .requestLabelTag(supportSource)
                .requestLabelTag(internalDownload)
                .build();
        HttpRequestBody jsonBody = HttpRequestBody.json(JSON.toJSONString(crawlerRequestRecord), "utf-8");
        initCrawlerRecord.getHttpRequest().setMethod(HttpConstant.Method.POST);
        initCrawlerRecord.getHttpRequest().setRequestBody(jsonBody);

        return CrawlerJob.builder()
                .crawlerJobThreadNumber(2)
                .triggerInfo(
                        domain,
                        CrawlerMetaConstant.ScheduleJobTrigger_Cron,
                        System.currentTimeMillis(),
                        StringUtils.joinWith("-",site,siteInfo,siteBiz,CrawlerMetaConstant.ScheduleJobTriggerJob_Realtime))
                .crawlerRequestQueue(CrawlerMetaConstant.redisRequestQueue(requestQueueName))
                .kafkaResultPipeline("kafka",kafkaTopic,null)
                .requestRecord(crawlerRequestRecord)
                .supportRecord(initCrawlerRecord)
                .build();
    }
}
